Loading Packages & Initialization

In [2]:
# --- Package loading & session setup ----------------------------------------
# NOTE(review): rm(list = ls()) wipes the global environment; kept because the
# notebook workflow depends on a clean session, but avoid in sourced scripts.
rm(list=ls())

library(data.table)
library(tidyverse)
library(rJava)
library(RNetLogo)     # driver for the NetLogo agent-based model

library(lhs)          # maximin Latin hypercube sampling
library(ggplot2)
library(plotly)       # interactive plotting
library(caret)
library(randomForest) # random-forest metamodel
library(factoextra)   # PCA helpers (get_pca_ind)
library(e1071)
library(TSrepr)       # mae/rmse/mape error measures on test predictions

library(gridExtra)    # was require(); library() fails loudly if missing

# Globally silence warnings for the notebook run.
# NOTE(review): this hides real problems too — consider options(warn = 0).
options(warn = -1)
In [3]:
# Master switch: 1 = regenerate train/test/pool data by actually running the
# ABM (slow), 0 = load the cached CSV files written by an earlier run.
GenerateTTData <- 0
In [4]:
# Start a NetLogo session and load the model.
# Is_Headless = 1 runs NetLogo without a GUI under the named nl.obj session.
Is_Headless <- 1
nl.model <- "Segregation_Dummy"

# Local installation paths — machine-specific; adjust before running elsewhere.
nl.path <- "C:/Program Files/NetLogo 6.0.4/app"
model.path <- paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/",nl.model,".nlogo")

if (Is_Headless == 0){
    NLStart(nl.path, gui = TRUE,nl.jarname='netlogo-6.0.4.jar')
    NLLoadModel (model.path)
    } else {
    # Headless session keyed by nl.model; all later NLCommand/NLReport calls
    # pass nl.obj = nl.model to address this session.
    NLStart(nl.path, gui = FALSE,nl.jarname='netlogo-6.0.4.jar', nl.obj = nl.model)
    NLLoadModel (model.path, nl.obj = nl.model)
    
    #NLStart(nl.path, gui = FALSE,nl.jarname='netlogo-6.0.4.jar', nl.obj = nl.model)
    #NLLoadModel (model.path, nl.obj = nl.model )
    }

Model Parameters & Functions

Set model parameters

In [5]:
set.seed(1)

## Model parameters -----------------------------------------------------------

# ABM replications per parameter instance (outputs are averaged).
nofrep <- 10

# Active-learning iteration budget.
iteration_budget <- 20

# Feature names, ordered exactly as run_model() sets them in NetLogo.
feature_names <- c(
  "density", "%-similar-wanted", "budget-multiplier-dummy",
  "density-multiplier-dummy", "noise-dummy", "tick-limit"
)

# NetLogo reporter queried as the model output.
output_name <- c("percent-similar")

# Number of ABM input parameters.
nofparams <- length(feature_names)

## Random-forest hyperparameters ----------------------------------------------
ntree <- 400
mtry <- 2

Set user parameters

In [6]:
# Per-row error metric reported on the test set: "RMSE", "MAPE" or "BIAS".
error_type <- "RMSE"

# Uncertainty measure used by adaptive sample selection: "sd" or "range".
selection_metric <- "sd"

# Data-set sizes.
unlabeled_ins <- 700      # unlabeled candidate pool
test_ins <- 400           # test set
train_ins_oneshot <- 700  # one-shot benchmark training set
train_ins_Ad <- 200       # initial adaptive training set

# Instances added to the training set at each adaptive step.
selected_ins <- 5

# Number of features removed per elimination step.
h <- 1

Define functions

run_model

In [7]:
#run_model <- function(feature_names,feature_values){ # both should be in character list format
# Run one replication of the NetLogo ABM for a single parameter instance.
#
# feature_values: values ordered like the global `feature_names`, which names
#                 the NetLogo globals to set before the run.
# nofticks:       number of "go" steps to execute. Defaults to 100, the value
#                 the original hard-coded; now a parameter so callers can vary
#                 the horizon (backward compatible).
#
# Returns the value of the `output_name` reporter after the run.
# Relies on globals: feature_names, output_name, nl.model.
run_model <- function(feature_values, nofticks = 100){
    # Push each parameter value into the corresponding NetLogo global.
    for(i in seq_along(feature_names)){
        NLCommand(paste0("set ", feature_names[i], " ", feature_values[i]),
                  nl.obj = nl.model)
    }
    NLCommand("setup", nl.obj = nl.model)
    NLDoCommand(nofticks, "go", nl.obj = nl.model)
    result <- NLReport(output_name, nl.obj = nl.model)
    return(result)
}

run_replicas

In [8]:
#run_replicas <- function(nofrep,feature_names,feature_values) {
# Run `nofrep` replications of the ABM for one parameter instance and return
# the mean output. `feature_values` is forwarded to run_model().
run_replicas <- function(nofrep, feature_values) {
    # Preallocate a plain numeric vector (the original used a 1 x nofrep
    # matrix, which mean() flattened anyway); seq_len() is safe for nofrep = 0.
    replicas <- numeric(nofrep)
    for (i in seq_len(nofrep)) {
        replicas[i] <- run_model(feature_values)
    }
    return(mean(replicas))
}

run_ABM

In [9]:
#run_ABM = function(nofrep,nofinstances,unlabeledset,featurenames = feature_names){
# Label the first `nofinstances` rows of `unlabeledset` by running the ABM.
# Adds/overwrites an `output` column in place (data.table reference semantics)
# and also returns the table.
# Relies on global `feature_names` for the column order run_model() expects.
run_ABM <- function(nofrep, nofinstances, unlabeledset){
    # Move the feature columns to the front so each row, flattened via
    # as.matrix(), lines up positionally with feature_names in run_model().
    unlabeledset <- setcolorder(unlabeledset, feature_names)
    # seq_len() instead of 1:n — safe when nofinstances is 0.
    for (i in seq_len(nofinstances)) {
        unlabeledset[i, output := run_replicas(nofrep, as.matrix(unlabeledset[i,]))]
    }
    return(unlabeledset)
}

error functions

In [10]:
#error functions on test data
# Root-mean-squared error of `predicted` against `actual`.
# For scalar inputs (as used per-row via mapply in get_test_predictions)
# this reduces to the absolute error.
rmse_func <- function(actual, predicted){
    sqrt(mean((predicted - actual)^2))
}

# Absolute percentage error (element-wise), in percent.
# Note: yields Inf/NaN when `actual` contains zeros.
mape_func <- function(actual,predicted){
    100 * abs(actual - predicted) / actual
}

# Relative bias: positive when the prediction underestimates `actual`,
# negative when it overestimates. Undefined (Inf/NaN) for actual == 0.
bias_func <- function(actual,predicted){
    (actual - predicted) / actual
}

# Out-of-bag (OOB) error of a fitted randomForest model.
# (Function name keeps the original "obb" spelling so callers still work.)
#  - regression:     MSE after the final tree, model$mse[model$ntree]
#  - classification: the full err.rate matrix
# Raises an error for any other model type instead of silently returning an
# undefined object (the original if/else chain had no terminal else branch).
obb_error_func <- function(model){
    if (model$type == "regression") {
        oob_error <- model$mse[model$ntree]
    } else if (model$type == "classification") {
        oob_error <- model$err.rate
    } else {
        stop("obb_error_func: unsupported model type: ", model$type)
    }
    return(oob_error)
}

get_test_predictions

In [11]:
# Predict on `testset` with `model`, attach a per-row error column, and
# compute aggregate test-set performance.
#
# model:     fitted randomForest regression metamodel
# testset:   data.table with the feature columns plus the true `output`
# errortype: per-row error column to add ("MAPE", "RMSE" or "BIAS").
#            BUG FIX: the original body compared against the global
#            `error_type` and ignored this argument; it now uses the
#            parameter (identical behavior for existing callers, which all
#            pass the global).
#
# Returns list(
#   [[1]] prediction table with `pred_output` and the error column,
#   [[2]] 1x3 matrix c(mae, rmse, mape) over the whole test set (TSrepr),
#   [[3]] c(true-output column name, predicted-output column name))
get_test_predictions <- function(model,testset,errortype){

    predictedLabels <- predict(model, testset)
    predictedLabels <- cbind(testset,predictedLabels)
    setnames(predictedLabels, "predictedLabels","pred_output")

    # Column 1 = true output, column 2 = predicted output.
    output_variables = colnames(select(predictedLabels, contains("output")))

    if(errortype == "MAPE"){
        predictedLabels[,MAPE := mapply(function(x,y) mape_func(x,y),get(output_variables[1]),get(output_variables[2]))]
    }
    if(errortype == "RMSE"){
        predictedLabels[,RMSE := mapply(function(x,y) rmse_func(x,y),get(output_variables[1]),get(output_variables[2]))]
    }
    if(errortype == "BIAS"){
        predictedLabels[,BIAS := mapply(function(x,y) bias_func(x,y),get(output_variables[1]),get(output_variables[2]))]
    }

    # Extract the true and predicted output vectors.
    output_variables_1 = predictedLabels[,get(output_variables[1]), with = TRUE]
    output_variables_2 = predictedLabels[,get(output_variables[2]), with = TRUE]

    # Aggregate performance (TSrepr mae/rmse/mape) over the whole test set.
    performance_temp = matrix(c(1:3), nrow = 1, ncol = 3)
    performance_temp[1] = mae(output_variables_1 , output_variables_2)
    performance_temp[2] = rmse(output_variables_1 , output_variables_2)
    performance_temp[3] = mape(output_variables_1 , output_variables_2)

    return(list(predictedLabels,performance_temp,output_variables))
}

sample_selection

In [12]:
# Adaptive sample selection: pick the `selected_ins` rows of `unlabeled_set`
# whose random-forest predictions are most uncertain, where uncertainty is
# either the per-row std. dev. or range of the individual tree predictions
# (chosen by the GLOBAL `selection_metric`, not a parameter).
# Side effect: adds an `idx` column to `unlabeled_set` by reference.
sample_selection <- function(selected_ins,unlabeled_set,model){
    # predict.all returns one prediction per tree; transpose so each
    # data.table column holds all tree predictions for one instance.
    ind_pred <- t(predict(model, unlabeled_set,predict.all = TRUE)$individual) %>%
                data.table() # predictions by each tree in the forest
    ind_pred_eval = data.table()
    
    # standard deviation of tree predictions per instance
    s_dev = sapply(ind_pred, sd) %>% data.table()
    setnames(s_dev,".","sd")
    ind_pred_eval = cbind(ind_pred_eval,s_dev)
    
    # range (max - min) of tree predictions per instance
    # NOTE(review): local `range` shadows base::range after the first line;
    # works here because base::range was already applied via sapply.
    range = sapply(ind_pred, range) %>% t() %>% data.table()
    range = range[,.(range = abs(range[,1] - range[,2]))]
    setnames(range,"range.V1","range")
    ind_pred_eval = cbind(ind_pred_eval,range)
        
    ind_pred_eval[,idx := 1:.N]
    
    # Keep the selected_ins most-uncertain instances under the chosen metric.
    if(selection_metric == "sd") {
      ind_pred_eval = ind_pred_eval[order(-sd)][1:selected_ins]
    }else if(selection_metric == "range"){
      ind_pred_eval = ind_pred_eval[order(-range)][1:selected_ins]
    }
    
    # Tag pool rows with their position so the caller can remove the
    # chosen rows from the pool afterwards.
    unlabeled_set[,idx := 1:.N]    
    train_candidates = unlabeled_set[ind_pred_eval$idx]
    
    return(train_candidates)
}

random_sample_selection

In [13]:
# Draw `selected_ins` rows uniformly at random (without replacement) from
# `unlabeled_set`. Side effect: adds an `idx` row-number column to
# `unlabeled_set` by reference; the returned candidates keep that column so
# the caller can drop them from the pool.
random_sample_selection <- function(selected_ins,unlabeled_set){

    # Tag every row with its position in the pool.
    unlabeled_set[,idx := 1:.N]

    # Uniform draw over the row indices.
    picked <- sample(unlabeled_set$idx, selected_ins, replace = FALSE, prob = NULL)

    # Subset preserves the original row order of the pool.
    unlabeled_set[idx %in% picked]
}

get_variable_importance

# Rank the global `feature_names` by permutation importance of a fitted
# randomForest (type = 1 = mean decrease in accuracy/MSE, unscaled).
# Returns the feature names ordered most- to least-important.
get_variable_importance <- function(model){
    importances <- importance(model, type = 1, scale = FALSE)
    ranked_idx <- order(importances, decreasing = TRUE)
    # (The original also computed the sorted importance values into an
    # unused local; dropped.)
    return(feature_names[ranked_idx])
}

feature_elimination

# Eliminate the `h` least-important features from the tail of the ranked list.
#
# h:                           number of features to eliminate this step
# total_numof_eliminated_vars: features already eliminated in earlier steps
# ranked_features:             full ranked list (most → least important);
#                              never shrunk between calls
#
# Returns list(columns_left, updated eliminated count, h, eliminated_columns).
#
# BUG FIX: the original derived the eliminated indices via
# setdiff(length - total, numof_columns_left), a setdiff of two scalars that
# yields at most ONE index — so with h > 1 only one eliminated feature was
# reported. The eliminated block is the whole index range
# (numof_columns_left + 1) .. (length - total_numof_eliminated_vars).
feature_elimination <- function(h,total_numof_eliminated_vars,ranked_features){
    # Index of the last still-active feature before this step.
    last_active <- length(ranked_features) - total_numof_eliminated_vars
    numof_columns_left <- last_active - h
    columns_left <- ranked_features[1:numof_columns_left]

    # Features eliminated in THIS step.
    eliminated_columns <- ranked_features[(numof_columns_left + 1):last_active]

    # Update the running count of eliminated variables.
    total_numof_eliminated_vars <- length(ranked_features) - length(columns_left)

    return(list(columns_left,total_numof_eliminated_vars,h,eliminated_columns))
}

Generate Unlabeled Data Pool

Latin hyper cube sampling

In [16]:
# Build (or load) the unlabeled candidate pool via maximin Latin hypercube
# sampling over the 6 ABM parameters, then map each unit-interval column onto
# its parameter range with qunif.
if(GenerateTTData == 1){
    set.seed(1)
    unlabeled_pool = as.data.table(maximinLHS(n = unlabeled_ins, k = nofparams, dup = 5))
    
    unlabeled_pool$V1 = qunif(unlabeled_pool$V1, 10, 90)          # density
    unlabeled_pool$V2 = qunif(unlabeled_pool$V2, 10, 90)          # %-similar-wanted
    unlabeled_pool$V3 = qunif(unlabeled_pool$V3, 1, 10)           # budget-multiplier-dummy
    unlabeled_pool$V4 = qunif(unlabeled_pool$V4, 0.01, 1)         # density-multiplier-dummy
    unlabeled_pool$V5 = qunif(unlabeled_pool$V5, 0.00001, 0.0001) # noise-dummy
    unlabeled_pool$V6 = qunif(unlabeled_pool$V6, 90, 110)         # tick-limit
    
    # Rename V1..V6 to the real feature names (order must match above).
    setnames(unlabeled_pool, c(paste0("V",1:nofparams)), feature_names)
    
    # Row index used later to remove selected candidates from the pool.
    unlabeled_pool[,idx := 1:.N]
        
    fwrite(unlabeled_pool, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/unlabeled_pool_",Sys.Date(),".csv"))
}else{
    # Load the pool cached by a previous generation run.
    unlabeled_pool <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/unlabeled_pool_2019-12-20.csv")   
}
In [ ]:
data_candidates =copy(unlabeled_pool)
In [17]:
# Visualize the pool's coverage of the input space on the first two
# principal components (idx column excluded; cor = TRUE standardizes).
pca_unlabeled_pool <- princomp(unlabeled_pool[,-c("idx")], cor = TRUE, scores = TRUE)
pca_unlabeled_pool_components <- get_pca_ind(pca_unlabeled_pool)
p_unlabeled_pool <- ggplot(data = data.table(pca_unlabeled_pool_components$coord[,1:2]), aes(x = Dim.1, y = Dim.2)) +
                    geom_point() +
                    labs( title = "") 
p_unlabeled_pool

Generate Test Set

In [18]:
# Build (or load) the test set: take `test_ins` pool rows as a template,
# overwrite every feature column with independent uniform draws (NOT LHS),
# then label with the ABM.
if(GenerateTTData == 1){
    set.seed(2)
    test_set <- head(unlabeled_pool,test_ins)
    
    ################## TODO: could generate values according to each variable's data type here
    test_set$density                    = runif(test_ins, 10, 90) 
    test_set$`%-similar-wanted`         = runif(test_ins, 10, 90) 
    test_set$`budget-multiplier-dummy`  = runif(test_ins, 1, 10) 
    test_set$`density-multiplier-dummy` = runif(test_ins, 0.01, 1) 
    test_set$`noise-dummy`              = runif(test_ins, 0.00001, 0.0001) 
    test_set$`tick-limit`               = runif(test_ins, 90, 110) 
    
    # Drop the pool row index; the test set is independent of the pool.
    test_set[,c("idx"):= NULL]
      
    print(paste0("ABM run start time : ",Sys.time()))
    test_set = run_ABM(nofrep,test_ins,test_set) %>% as.data.table()
    print(paste0("ABM run end time : ",Sys.time()))
    
    fwrite(test_set, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/test_set_",Sys.Date(),".csv"))
}else{
    # Load the test set cached by a previous generation run.
    test_set <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/test_set_2019-12-20.csv")  
}

Timing notes: 10 instances × 10 replications ≈ 1 min; 100 × 10 ≈ 14 min; 900 × 10 started 2019-12-03 07:54:10 +03 and finished around 09:16.

In [19]:
# PCA scatter of the test set on the first two components (all columns,
# including `output`, enter the PCA here).
pca_test_set <- princomp(test_set, cor = TRUE, scores = TRUE)
pca_test_set_components <- get_pca_ind(pca_test_set)
p_test_set <- ggplot(data = data.table(pca_test_set_components$coord[,1:2]), aes(x = Dim.1, y = Dim.2)) +
                    geom_point() +
                    labs( title = "") 
p_test_set

Benchmark : One-shot sampling, No feature elimination

Generate Training Set

Select a very big data pool (nofinstances should be very high), e.g. 1000.

In [20]:
# Build (or load) the one-shot benchmark training set: a large maximin LHS
# design over the same parameter ranges as the pool, labeled with the ABM.
if(GenerateTTData == 1){
    set.seed(3)
    training_set = as.data.table(maximinLHS(n = train_ins_oneshot, k = nofparams, dup = 5))

    # Map the unit hypercube onto each parameter's range.
    training_set$V1 = qunif(training_set$V1, 10, 90)          # density
    training_set$V2 = qunif(training_set$V2, 10, 90)          # %-similar-wanted
    training_set$V3 = qunif(training_set$V3, 1, 10)           # budget-multiplier-dummy
    training_set$V4 = qunif(training_set$V4, 0.01, 1)         # density-multiplier-dummy
    training_set$V5 = qunif(training_set$V5, 0.00001, 0.0001) # noise-dummy
    training_set$V6 = qunif(training_set$V6, 90, 110)         # tick-limit

    setnames(training_set, c(paste0("V",1:nofparams)), feature_names)

    # Placeholder output column; run_ABM overwrites it row by row.
    training_set$output <- 0.00

    print(paste0("ABM run start time : ",Sys.time()))
    # BUG FIX: the original passed the undefined object `LHSample` here,
    # which would fail whenever GenerateTTData == 1.
    training_set = run_ABM(nofrep,train_ins_oneshot,training_set) %>% as.data.table()
    print(paste0("ABM run end time : ",Sys.time()))

    fwrite(training_set, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/training_set_",Sys.Date(),".csv"))

}else{
    # Load the training set cached by a previous generation run.
    training_set <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Data_2019-12-20.csv")
}
In [21]:
one_shot_data = copy(training_set)

Visualization

In [22]:
# PCA of the one-shot training features (output excluded), plotted on the
# first two components and colored by the ABM output.
pca_training_set <- princomp(training_set[,.SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)

#fviz_pca_ind(pca_LHSample,
#             col.ind = "cos2", # Color by the quality of representation
#             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
#              geom="point"
#             )

pca_training_set_components <- get_pca_ind(pca_training_set)
pca_training_set_components <-cbind(pca_training_set_components$coord[,1:2],training_set[,.SD, .SDcols = c("output")])
p_training_set <- ggplot(data = pca_training_set_components, aes(x = Dim.1, y = Dim.2)) +
             geom_point(aes(colour = output)) +
             labs( title = "", legend = "output") 
p_training_set

Train & Test Metamodel

# Train the one-shot random-forest metamodel on all features at once.
model_oneshot <- randomForest(x = training_set[, -c("output")], y = training_set$output, importance = TRUE,ntree = ntree, mtry = mtry)
model_oneshot
Call:
 randomForest(x = training_set[, -c("output")], y = training_set$output,      ntree = ntree, mtry = mtry, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 400
No. of variables tried at each split: 2

          Mean of squared residuals: 28.61753
                    % Var explained: 90.58
In [24]:
obb_error_oneshot <- obb_error_func(model_oneshot)
In [ ]:
#OBB_pred = cbind(training_set$output,model_oneshot$predicted)
#names(OBB_pred) <- c("actual","predicted")
In [25]:
plot(model_oneshot$mse, type="l")
In [26]:
# Evaluate the one-shot model on the test set; [[1]] = prediction table,
# [[2]] = c(mae, rmse, mape), [[3]] = output column names.
test_prediction_oneshot = get_test_predictions(model_oneshot,test_set,error_type)
predictedLabels_oneshot = test_prediction_oneshot[[1]]

# One-row performance table (iter = 1) matching the adaptive tables' layout.
performance_table_oneshot = data.table(iter = numeric(), mae= numeric(),rmse= numeric(), mape = numeric())
# Keep test set error records
performance_table_oneshot = rbind(performance_table_oneshot, data.table(1, test_prediction_oneshot[[2]]), use.names = FALSE)

# c(true output column, predicted output column) — reused by later plots.
output_variables = test_prediction_oneshot[[3]]
# Display the one-shot results: aggregate errors, OOB error, and a sample
# of the per-row predictions.
performance_table_oneshot
obb_error_oneshot
head(predictedLabels_oneshot)
A data.table: 1 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
13.8544225.0487575.225406
28.6175339273172
A data.table: 6 × 9
density%-similar-wantedbudget-multiplier-dummydensity-multiplier-dummynoise-dummytick-limitoutputpred_outputRMSE
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
89.8078163.221019.5593640.71832329.772682e-05 94.8546198.6208783.9097614.7111036
65.9530427.517301.1139790.46118304.533282e-05 91.3538366.0909070.59657 4.5056738
65.6374842.720067.4602210.24982484.748342e-05106.6083286.5381086.40176 0.1363461
49.0847453.361871.0291110.32092412.648540e-05 95.3160297.0672088.54421 8.5229848
82.3166632.863499.9374370.74247809.336225e-05 91.0006673.9935671.21197 2.7815976
59.0871922.267415.2813720.73510981.573031e-05 98.7811463.6766866.39168 2.7150012
In [ ]:
# Long-format one-shot performance (one row per error metric) used as the
# dashed benchmark lines in later learning-curve plots.
performance_molten_oneshot <- melt(data = performance_table_oneshot
                             , id.vars = 'iter')
setnames(performance_molten_oneshot, c("variable","value"),c("errortype","errorvalue"))
In [28]:
# Actual vs. fitted scatter for the one-shot model, colored by residual
# (predicted - actual); the identity line marks perfect prediction.
p_oneshot <- ggplot(predictedLabels_oneshot,aes(x = get(output_variables[1]), y = get(output_variables[2]), color = (get(output_variables[2]) - get(output_variables[1])))) +
            geom_point() +
            geom_abline() +
            xlab("actual values") +
            ylab("fitted values")

p_oneshot

Random Sampling & No Feature Elimination

Generate Training Set

Select a relatively big data pool (nofinstances should be medium), e.g. 400.

In [29]:
# Build (or load) the smaller initial training set for the adaptive/random
# sampling experiments: maximin LHS over the same parameter ranges, labeled
# with the ABM.
if(GenerateTTData == 1){
    set.seed(4)   
    training_set_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
    
    training_set_Ad$V1 = qunif(training_set_Ad$V1, 10, 90)          # density
    training_set_Ad$V2 = qunif(training_set_Ad$V2, 10, 90)          # %-similar-wanted
    training_set_Ad$V3 = qunif(training_set_Ad$V3, 1, 10)           # budget-multiplier-dummy
    training_set_Ad$V4 = qunif(training_set_Ad$V4, 0.01, 1)         # density-multiplier-dummy
    training_set_Ad$V5 = qunif(training_set_Ad$V5, 0.00001, 0.0001) # noise-dummy
    training_set_Ad$V6 = qunif(training_set_Ad$V6, 90, 110)         # tick-limit
    
    setnames(training_set_Ad, c(paste0("V",1:nofparams)), feature_names)
    # Placeholder output column; run_ABM overwrites it row by row.
    training_set_Ad$output <- 0.00
    
    print(paste0("ABM run start time : ",Sys.time()))
    training_set_Ad = run_ABM(nofrep,train_ins_Ad,training_set_Ad) %>% as.data.table()
    print(paste0("ABM run end time : ",Sys.time()))
    
    fwrite(training_set_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/training_set_Ad",Sys.Date(),".csv"))

}else{
    # Load the initial adaptive training set cached by a previous run.
    training_set_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_2019-12-20.csv")
}
In [30]:
adaptive_initial_data = copy(training_set_Ad)

Visualization

In [86]:
# PCA scatter of the initial adaptive training data (features only),
# colored by the ABM output.
pca_training_set_Ad <- princomp(adaptive_initial_data[,.SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)

pca_training_set_Ad_components <- get_pca_ind(pca_training_set_Ad)
pca_training_set_Ad_components <-cbind(pca_training_set_Ad_components$coord[,1:2],adaptive_initial_data[,.SD, .SDcols = c("output")])
p_training_set_Ad <- ggplot(data = pca_training_set_Ad_components, aes(x = Dim.1, y = Dim.2)) +
             geom_point(aes(colour = output)) +
             labs( title = "", legend = "output") 
p_training_set_Ad

Train & Test Metamodel

In [31]:
# Decide on strategy:
#iteration_budget = 3   #specified above

## Initialize record tables for the random-sampling experiment.
# Candidates added to the training set at each iteration.
train_candidates_table = data.table()

# Test-set performance (mae/rmse/mape) per iteration.
performance_table = data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())

# OOB error per iteration.
obb_error = data.table(iter = numeric(), obb_error = numeric())

## initialize variables
# Work on a copy so the test set itself stays undistorted while per-iteration
# prediction columns are appended.
predictedLabels_table = copy(test_set)
In [32]:
# Random-sampling baseline loop: at each iteration train a random forest on
# the current training data, record OOB/test errors, then (except on the last
# iteration) move `selected_ins` randomly chosen pool rows into the training
# set after labeling them with the ABM.
set.seed(10)
print(paste0("section start time : ",Sys.time()))
iter = 1
while(iter <= iteration_budget){   
    print(iter)

    trainx = training_set_Ad[,.SD, .SDcols = feature_names]
    trainy = training_set_Ad$output
    
    # Train the model
    model_Sub <- randomForest( x = trainx, y =  trainy,importance = TRUE,ntree = ntree, mtry = mtry)
    # Keep every iteration's model as model_Sub_<iter> in the global env.
    assign(paste0("model_Sub_",iter),model_Sub)
                     
    obb_error = rbind(obb_error,data.table(iter,obb_error_func(model_Sub)),use.names=FALSE)
    
    # test the model on test set; suffix this iteration's prediction/error
    # columns with _<iter> and append them to the running table
    test_predictions_Sub = get_test_predictions(model_Sub,test_set,error_type)
    predictedLabels_Sub = test_predictions_Sub[[1]]
    setnames(predictedLabels_Sub,c("pred_output",error_type), c(paste0("pred_output_",iter),paste0(error_type,"_",iter)))    
    predictedLabels_table = cbind(predictedLabels_table,predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_",iter),paste0(error_type,"_",iter))])
    
    # Keep test set error records
    performance_table = rbind(performance_table,data.table(iter,test_predictions_Sub[[2]]), use.names = FALSE)    

    if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.
        
    ## sample selection from unlabeled data select candidates
    # Copy so random_sample_selection's in-place idx column does not
    # permanently modify the pool.
    unlabeled_set <- copy(unlabeled_pool)
    train_candidates = random_sample_selection(selected_ins,unlabeled_set)
        
    # Eliminate train candidates from the unlabeled pool
    unlabeled_pool = unlabeled_pool[- train_candidates$idx]
    rm(unlabeled_set)
    
    # run ABM to find outputs of train candidates
    print(paste0("ABM train_candidate run start time : ",Sys.time()))
    train_candidates = run_ABM(nofrep,selected_ins,train_candidates)
    print(paste0("ABM train_candidate run end time : ",Sys.time()))
    
    train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))

    # Add new data to train data
    training_set_Ad = rbind(training_set_Ad,train_candidates[,-c("idx")])
    }
    iter = iter + 1
}

# TODO: add a plot so progress is shown at every iteration.
#setcolorder(data,variableorder) ################# NOTE: you may need to place this somewhere -- careful!!
print(paste0("section end time : ",Sys.time()))
[1] 1
[1] "ABM train_candidate run start time : 2020-01-08 20:17:06"
[1] "ABM train_candidate run end time : 2020-01-08 20:18:51"
[1] 2
[1] "ABM train_candidate run start time : 2020-01-08 20:18:52"
[1] "ABM train_candidate run end time : 2020-01-08 20:19:32"
[1] 3
[1] "ABM train_candidate run start time : 2020-01-08 20:19:33"
[1] "ABM train_candidate run end time : 2020-01-08 20:20:54"
[1] 4
[1] "ABM train_candidate run start time : 2020-01-08 20:20:54"
[1] "ABM train_candidate run end time : 2020-01-08 20:21:59"
[1] 5
[1] "ABM train_candidate run start time : 2020-01-08 20:21:59"
[1] "ABM train_candidate run end time : 2020-01-08 20:22:23"
[1] 6
[1] "ABM train_candidate run start time : 2020-01-08 20:22:24"
[1] "ABM train_candidate run end time : 2020-01-08 20:23:55"
[1] 7
[1] "ABM train_candidate run start time : 2020-01-08 20:23:55"
[1] "ABM train_candidate run end time : 2020-01-08 20:24:43"
[1] 8
[1] "ABM train_candidate run start time : 2020-01-08 20:24:43"
[1] "ABM train_candidate run end time : 2020-01-08 20:25:06"
[1] 9
[1] "ABM train_candidate run start time : 2020-01-08 20:25:07"
[1] "ABM train_candidate run end time : 2020-01-08 20:25:49"
[1] 10
[1] "ABM train_candidate run start time : 2020-01-08 20:25:54"
[1] "ABM train_candidate run end time : 2020-01-08 20:26:36"
[1] 11
[1] "ABM train_candidate run start time : 2020-01-08 20:26:36"
[1] "ABM train_candidate run end time : 2020-01-08 20:27:23"
[1] 12
[1] "ABM train_candidate run start time : 2020-01-08 20:27:25"
[1] "ABM train_candidate run end time : 2020-01-08 20:27:59"
[1] 13
[1] "ABM train_candidate run start time : 2020-01-08 20:28:00"
[1] "ABM train_candidate run end time : 2020-01-08 20:28:21"
[1] 14
[1] "ABM train_candidate run start time : 2020-01-08 20:28:22"
[1] "ABM train_candidate run end time : 2020-01-08 20:30:37"
[1] 15
[1] "ABM train_candidate run start time : 2020-01-08 20:30:37"
[1] "ABM train_candidate run end time : 2020-01-08 20:31:20"
[1] 16
[1] "ABM train_candidate run start time : 2020-01-08 20:31:21"
[1] "ABM train_candidate run end time : 2020-01-08 20:32:05"
[1] 17
[1] "ABM train_candidate run start time : 2020-01-08 20:32:06"
[1] "ABM train_candidate run end time : 2020-01-08 20:32:54"
[1] 18
[1] "ABM train_candidate run start time : 2020-01-08 20:32:54"
[1] "ABM train_candidate run end time : 2020-01-08 20:33:11"
[1] 19
[1] "ABM train_candidate run start time : 2020-01-08 20:33:11"
[1] "ABM train_candidate run end time : 2020-01-08 20:33:27"
[1] 20

Timing: started 2020-01-08 20:17:06, ended 2020-01-08 20:33:27 — 10 replications × 5 samples × 19 selection iterations = 950 ABM runs.

In [33]:
# Final records: snapshot the random-sampling (_Rd) results before the tables
# are re-initialized for the adaptive-sampling experiment.
FinalTrainData_Rd = copy(training_set_Ad)
performance_table_Rd = copy(performance_table)
train_candidates_table_Rd  = copy(train_candidates_table)
predictedLabels_table_Rd = copy(predictedLabels_table)
obb_error_Rd = copy(obb_error)
In [36]:
# Persist the random-sampling results to CSV.
fwrite(FinalTrainData_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_Rd_DummyCode_",Sys.Date(),".csv") )
fwrite(performance_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_Rd_DummyCode_",Sys.Date(),".csv") )
fwrite(train_candidates_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_Rd_DummyCode_",Sys.Date(),".csv") )
# NOTE(review): "Rdd" below looks like a typo for "Rd" — confirm before
# renaming, since existing files on disk may already use this name.
fwrite(predictedLabels_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_Rdd_DummyCode_",Sys.Date(),".csv") )
fwrite(obb_error_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_Rd_DummyCode_",Sys.Date(),".csv") )
In [35]:
# Show results: final training-set size, per-iteration errors, selected
# candidates, sample predictions, and OOB errors.
nrow(FinalTrainData_Rd)
performance_table_Rd 
train_candidates_table_Rd  
head(predictedLabels_table_Rd)
obb_error_Rd
295
A data.table: 20 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
16.2436847.8494678.775010
26.1766377.7795928.683802
36.1108927.6510798.620170
46.0119027.5057488.398487
56.1631147.6364938.615123
66.1451067.5502898.628307
76.0208547.4014758.431386
85.9536837.3193868.357513
95.9561937.3205578.369063
105.5979286.9594377.834071
115.6144486.9160077.851498
125.7290507.0657508.016023
135.7609867.1225358.037418
145.6385506.9718847.898726
155.3531836.6474787.464167
165.3498686.6984307.403973
175.4908096.7885547.573871
185.3512786.6719107.409060
195.5288056.8310437.638558
205.3667336.7217507.406430
A data.table: 95 × 9
density%-similar-wantedbudget-multiplier-dummydensity-multiplier-dummynoise-dummytick-limitidxoutputiter
<dbl><dbl><dbl><dbl><dbl><dbl><int><dbl><dbl>
71.2881251.465402.1402000.249789513.648172e-05 94.55682 57 95.256441
25.8707271.936497.5513670.591464572.936624e-05101.52524114 99.899321
30.7938440.947365.0576920.547360485.046197e-05107.57766145 92.502351
81.3753081.281519.2895250.511144748.039323e-05 93.96923261 51.398901
55.3471745.923105.9619400.335380867.345333e-05 99.55940669 90.257911
23.7083085.228348.4641880.519916027.280021e-05107.23067398 64.663762
43.9479551.139972.6771450.096016834.789239e-05103.33558429 97.471212
38.1078266.356121.6252960.785083936.034256e-05106.57437430 99.129072
16.6845017.944566.9354890.970665186.493580e-05 91.91217467 81.840512
69.5606232.155451.3455360.060667981.668196e-05101.45277584 72.992852
78.7822329.752523.5229750.173418053.033961e-05 96.49102185 73.440113
74.1823538.337217.7897360.714420039.468610e-05 97.69689213 81.532443
72.8623276.504951.8402880.791870223.485970e-05 96.02683237 53.280673
36.9888276.300042.9481290.648756657.577111e-05 99.00537367 70.764003
38.1469318.364274.2852290.053201122.038089e-05 97.60585464 64.211853
77.5776970.295955.3814720.166441253.588723e-05103.67594 43 99.671764
42.1871129.881803.1367540.861058775.433321e-05 93.50062149 76.649224
25.9156034.208303.7434510.322126453.915094e-05100.13414279 92.059474
55.0375285.114079.0696900.480883399.410761e-05100.08649427 51.438064
10.3052657.400074.0073210.169388095.603003e-05103.06316597100.000004
79.4545849.099175.2873830.446959735.786716e-05103.91660 71 88.048135
48.2580949.822155.4093940.865724777.425128e-05104.62163106 90.686575
80.3165753.424547.0840640.224170832.094673e-05103.94898268 94.473905
62.8754953.298061.7385690.988118524.330548e-05109.23899494 95.663855
13.6914279.246794.8822070.919984498.973598e-05107.78014578100.000005
41.1943843.545578.5443980.227090247.378919e-05100.53919 91 91.650266
84.1799280.218623.6702420.256262133.396680e-05 90.87091305 51.373356
75.8980250.052497.9128260.643032913.102479e-05104.78767321 95.032726
22.2851723.094069.2561040.080754795.360178e-05 93.08863347 77.451446
30.1073568.416061.6470210.478389959.758411e-05106.42074453 99.873266
...........................
82.7371024.242047.3105490.344682463.728506e-05104.36580230 58.3065114
76.7536688.862625.5835720.989494395.849406e-05104.06978521 50.2276814
56.2809182.520712.0741320.067063065.985505e-05 92.17322536 51.7904014
88.1915714.542006.0918230.295950404.393429e-05 92.31813565 55.0324414
15.0683210.394521.8133440.179670071.074826e-05 90.53788631 81.3886914
64.0630913.141997.7708210.670805612.658880e-05104.32262214 53.5228115
50.6084216.901923.8839840.956287002.654616e-05 94.19470249 60.4284615
74.0565578.220536.5028940.757722382.507659e-05102.41128254 53.4670315
60.6179615.662001.1582430.954383899.106358e-05109.14396353 55.3517715
17.9703686.041959.2388760.037576144.398792e-05 97.03482401 81.6949215
64.5789124.544535.3937420.813704617.837151e-05106.11559157 63.0847616
33.1800731.485517.2035790.384925235.166390e-05103.54598391 79.7269116
18.4135879.443741.4290130.707652256.009265e-05 92.43321431 86.7490116
55.4082724.091011.3494430.997360391.484829e-05100.04629444 65.3522016
50.8005712.664021.1881390.698078994.460081e-05108.60291471 57.4248716
44.1700566.608322.7732690.021008428.929845e-05107.76953343 99.0965717
65.0880385.781781.0909530.607554111.204409e-05101.28025356 50.3716517
34.6504354.317032.0129080.918210861.105262e-05105.03771433 98.1454317
89.1248720.552583.3947640.938895977.867734e-05 93.53060447 57.4933917
53.7812489.045194.3789290.857177921.545563e-05 98.87193459 50.6389617
20.9995863.382966.7814780.171807346.428871e-05103.69223153 99.6091318
70.4227148.171129.0219160.578869843.314637e-05 94.97180304 88.4961618
53.6090757.967921.0037700.890691042.733878e-05 98.58536357 97.0434518
81.8280336.030678.3323240.328377526.137822e-05 98.96806362 77.9072118
29.7194422.885599.9842340.417995351.912512e-05107.39988413 72.0663618
62.4337246.505203.8966350.386562093.765022e-05104.90959 10 89.2224819
47.0143535.337603.7913960.650267747.622026e-05101.34832 15 84.4759719
22.6776746.172424.6121450.333755552.008172e-05105.43650 78 93.6143619
43.1186157.704133.4422560.522126092.995916e-05 97.32172225 97.6675919
12.6254676.220903.2275100.121989291.229947e-05107.46005544100.0000019
A data.table: 6 × 47
density%-similar-wantedbudget-multiplier-dummydensity-multiplier-dummynoise-dummytick-limitoutputpred_output_1RMSE_1pred_output_2...pred_output_16RMSE_16pred_output_17RMSE_17pred_output_18RMSE_18pred_output_19RMSE_19pred_output_20RMSE_20
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
89.8078163.221019.5593640.71832329.772682e-05 94.8546198.6208779.8729518.74791378.96869...81.9920916.628776080.9818917.63897880.7487917.87207980.8928317.72803480.6459217.974944
65.9530427.517301.1139790.46118304.533282e-05 91.3538366.0909074.45472 8.36382175.32002...72.95432 6.863420572.23863 6.14773573.00030 6.90940872.83134 6.74044773.31796 7.227062
65.6374842.720067.4602210.24982484.748342e-05106.6083286.5381089.91434 3.37623989.91900...88.63943 2.101329688.05757 1.51947287.97737 1.43926788.65271 2.11460788.45601 1.917910
49.0847453.361871.0291110.32092412.648540e-05 95.3160297.0672092.35617 4.71102992.97841...90.52765 6.539543790.00928 7.05791688.85407 8.21313090.47951 6.58768690.66949 6.397709
82.3166632.863499.9374370.74247809.336225e-05 91.0006673.9935671.20232 2.79124871.69961...73.01726 0.976308172.32583 1.66773472.04123 1.95233271.56738 2.42618572.40483 1.588733
59.0871922.267415.2813720.73510981.573031e-05 98.7811463.6766870.01950 6.34282070.10741...66.69868 3.022003065.71863 2.04195565.49788 1.82120067.39739 3.72071765.33554 1.658867
A data.table: 20 × 2
iterobb_error
<dbl><dbl>
168.52227
264.30361
366.29439
463.92354
562.68985
662.55594
761.40064
861.55978
959.05219
1055.67706
1155.07549
1257.29888
1358.27891
1456.28211
1551.84444
1654.45316
1751.91830
1852.68037
1951.80353
2053.29537
In [87]:
# Reshape per-iteration performance to long format and plot the learning
# curves against the one-shot benchmark (dashed horizontal lines).
performance_molten_Rd <- melt(data = performance_table_Rd
                             , id.vars = 'iter')
setnames(performance_molten_Rd, c("variable","value"),c("errortype","errorvalue"))
# FIX: geom_hline() no longer accepts stat = "hline" (the "hline" stat was
# removed in ggplot2 2.0 and passing it errors); the argument is dropped.
p_Rd = ggplot(performance_molten_Rd, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) + 
          geom_line(lwd=1)+
          geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype), linetype = "dashed")
p_Rd

Final Visualization

In [88]:
# PCA scatter of the FINAL random-sampling training data (initial LHS rows
# plus all adaptively added candidates), colored by output.
pca_final_Rd_training_set <- princomp(FinalTrainData_Rd[,.SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)

pca_final_Rd_training_set_components <- get_pca_ind(pca_final_Rd_training_set)
pca_final_Rd_training_set_components <-cbind(pca_final_Rd_training_set_components$coord[,1:2],FinalTrainData_Rd[,.SD, .SDcols = c("output")])
p_final_Rd_training_set <- ggplot(data = pca_final_Rd_training_set_components, aes(x = Dim.1, y = Dim.2)) +
             geom_point(aes(colour = output)) +
             labs( title = "", legend = "output") 
p_final_Rd_training_set
In [89]:
grid.arrange(p_training_set_Ad,p_final_Rd_training_set, ncol=2)

Adaptive Sampling & No Feature Elimination

Generate Training Set

Select a relatively big data pool (nofinstances should be medium), e.g. 400.

In [37]:
# Reset to the same starting state the random-sampling section used, so the
# adaptive experiment is directly comparable.
training_set_Ad = copy(adaptive_initial_data)
unlabeled_pool =copy(data_candidates)
In [ ]:
#if(GenerateTTData == 1){
#   
#    LHSample_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
#    
#    LHSample_Ad$V1 = qunif(LHSample_Ad$V1, 10, 90) 
#    LHSample_Ad$V2 = qunif(LHSample_Ad$V2, 10, 90) 
#    setnames(LHSample_Ad, c("V1","V2"), feature_names)
#    LHSample_Ad$output <- 0.00
#    
#    paste0("ABM run start time : ",Sys.time())
#    LHSample_Ad = run_ABM(nofrep,train_ins_Ad,LHSample_Ad) %>% as.data.table()
#    paste0("ABM run end time : ",Sys.time())
#    
#    fwrite(LHSample_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))
#
#}else{
#    LHSample_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
#    LHSample_Ad <- head(LHSample_Ad[`%-similar-wanted` < 90],300)
#
#}

Visualization

In [38]:
pca_training_set_Ad <- princomp(training_set_Ad[,-c("output")], cor = TRUE, scores = TRUE)
In [39]:
#fviz_pca_ind(pca_LHSample,
#             col.ind = "cos2", # Color by the quality of representation
#             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
#              geom="point"
#             )

# First two principal-component scores of the initial adaptive training set,
# joined with the ABM output for colouring.
ad_coords <- get_pca_ind(pca_training_set_Ad)$coord[, 1:2]
pca_training_set_Ad_components <- cbind(ad_coords, training_set_Ad[, c("output")])

# Scatter in PCA space, coloured by output value.
p_training_set_Ad <- ggplot(pca_training_set_Ad_components,
                            aes(x = Dim.1, y = Dim.2)) +
  geom_point(aes(colour = output)) +
  labs(title = "", legend = "output")
p_training_set_Ad

Train & Test Metamodel

In [ ]:
# Decide on strategy:
#iteration_budget = 3

#h = 1 # specify how many variable will be eliminated in each elimination iteration
In [40]:
## Bookkeeping for the adaptive (no feature elimination) loop.

# Labeled candidates selected at each iteration.
train_candidates_table = data.table()

# Test-set error metrics recorded per iteration.
performance_table = data.table(
  iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric()
)

# Out-of-bag error recorded per iteration.
obb_error = data.table(iter = numeric(), obb_error = numeric())

# Work on a copy so the shared test set itself stays untouched.
predictedLabels_table = copy(test_set)
In [41]:
# Adaptive (uncertainty-based) sampling loop, no feature elimination:
# each iteration trains an RF metamodel on the current labeled set, records
# OOB and test errors, then (until the budget is reached) picks the most
# uncertain unlabeled candidates, labels them via ABM runs, and adds them
# to the training set. Relies on project helpers obb_error_func,
# get_test_predictions, sample_selection, run_ABM defined earlier in the file.
set.seed(10)
print(paste0("section start time : ",Sys.time()))
iter = 1
while(iter <= iteration_budget){   
    print(iter)

    trainx = training_set_Ad[,.SD, .SDcols = feature_names]
    trainy = training_set_Ad$output
    
    # Train the model
    model_Sub <- randomForest( x = trainx, y =  trainy,importance = TRUE,ntree = ntree, mtry = mtry)
    # Keep each iteration's model as model_Sub_<iter> for later inspection
    # (e.g. varImpPlot at the end of the notebook).
    assign(paste0("model_Sub_",iter),model_Sub)
                    
    # Record out-of-bag error for this iteration.
    obb_error = rbind(obb_error,data.table(iter,obb_error_func(model_Sub)),use.names=FALSE)

    # test the model on test set
    test_predictions_Sub = get_test_predictions(model_Sub,test_set,error_type)
    predictedLabels_Sub = test_predictions_Sub[[1]]
    # Suffix the prediction/error columns with the iteration number so all
    # iterations can live side by side in predictedLabels_table.
    setnames(predictedLabels_Sub,c("pred_output",error_type), c(paste0("pred_output_",iter),paste0(error_type,"_",iter)))    
    predictedLabels_table = cbind(predictedLabels_table,predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_",iter),paste0(error_type,"_",iter))])
    
    # Keep test set error records
    performance_table = rbind(performance_table,data.table(iter,test_predictions_Sub[[2]]), use.names = FALSE)
    
    if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.    
    ## sample selection from unlabeled data select candidates
        unlabeled_set <- copy(unlabeled_pool)
        train_candidates = sample_selection(selected_ins, unlabeled_set, model_Sub)
        
        # eliminate candidates from the unlabeled pool
        unlabeled_pool = unlabeled_pool[-train_candidates$idx]
        rm(unlabeled_set)
        
        # run ABM to find outputs of train candidates
        print(paste0("ABM train_candidate run start time : ",Sys.time()))
        train_candidates = run_ABM(nofrep, selected_ins, train_candidates)
        print(paste0("ABM train_candidate run end time : ",Sys.time()))
        
        train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))
        
        # add labeled candidates to the train data
        training_set_Ad = rbind(training_set_Ad, train_candidates[, -c("idx")])
    }
    iter = iter + 1
}
print(paste0("section end time : ",Sys.time()))
[1] "2020-01-08 20:43:39 +03"
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
In [42]:
# Final records
# Snapshot the run's results under "_Ad" names, since the generic tables
# (performance_table, obb_error, ...) are reused by the next scenario.
FinalTrainData_Ad = copy(training_set_Ad)
performance_table_Ad = copy(performance_table)
train_candidates_table_Ad  = copy(train_candidates_table)
predictedLabels_table_Ad = copy(predictedLabels_table)
obb_error_Ad = copy(obb_error)
In [43]:
#fwrite(FinalTrainData_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_Ad_DummyCode_",Sys.Date(),".csv") )
#fwrite(performance_table_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_Ad_DummyCode_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_Ad_DummyCode_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_Ad_DummyCode_",Sys.Date(),".csv") )
#fwrite(obb_error_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_Ad_DummyCode_",Sys.Date(),".csv") )
In [44]:
# Inspect the adaptive-run results (printed below the cell).
nrow(FinalTrainData_Ad)
performance_table_Ad
train_candidates_table_Ad
head(predictedLabels_table_Ad)
obb_error_Ad
295
A data.table: 20 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
16.3146457.8917268.863016
26.1868707.5675098.791270
36.0632257.3479818.559153
45.6296926.8546767.924934
55.5032296.7039887.792226
65.7294416.9489828.036690
75.3701316.5382157.572908
85.2963106.5077447.491420
95.2045076.2985997.225636
105.0952476.1892287.098340
115.1162176.2281757.186941
125.0402596.1173457.114422
134.8790925.9192946.825544
144.8821065.9220806.827458
154.7638285.8327086.655991
164.8535475.9245486.827251
174.7202655.7894256.561778
184.5769985.5967936.395540
194.4997365.4984946.253749
204.5823895.6058116.381325
A data.table: 95 × 9
density%-similar-wantedbudget-multiplier-dummydensity-multiplier-dummynoise-dummytick-limitidxoutputiter
<dbl><dbl><dbl><dbl><dbl><dbl><int><dbl><dbl>
13.1629581.923704.9184420.907486705.156864e-05108.65117301 99.945061
10.4145789.912211.0360820.575976047.662668e-05109.04119587100.000001
13.3739588.403173.4491000.078683668.048873e-05108.15717471100.000001
44.5796074.762775.1211590.992226544.274789e-05106.00483289 99.852131
49.9058475.019957.6456170.547576427.311075e-05104.21166133 61.425221
66.0190088.679811.0508620.057447549.129029e-05109.94240579 50.467282
13.2865782.765492.9116500.859077509.799145e-05 93.38701596100.000002
56.9609381.453531.5283510.252687157.777971e-05 98.44494317 52.617832
10.0898989.720637.8217480.015112669.711906e-05 99.29901575100.000002
47.6103688.244851.0516500.972127479.549873e-05104.08723540 51.415302
55.4937089.490276.6556750.018260375.203562e-05 97.66010548 50.161183
20.5299685.404418.9078930.070903944.997140e-05107.09321304 71.286093
54.1635769.358884.6155260.749360271.294256e-05109.73904428 99.805323
89.2954260.672413.2827880.475271601.300112e-05109.27348397 98.280373
34.1186489.131361.1337510.342174792.966778e-05107.82599336 54.165003
40.5773861.181476.4572810.058807916.655425e-05109.86849522 99.103444
24.9075979.307426.4500800.777396682.313542e-05108.46347237 81.552364
77.1780071.764481.1689270.110217928.985509e-05 92.04914351 89.708794
25.1040089.860086.9431830.428231813.879462e-05107.34111560 58.820774
68.5810148.683841.1447120.426759659.054583e-05109.67926384 88.481934
81.1812989.352705.3600460.961220711.060612e-05108.52618556 50.146385
59.8607074.549273.6914020.698808774.577660e-05102.02923317 99.850025
16.7809384.758192.5196910.285559669.487103e-05 99.13079527 85.672665
26.1015582.083756.3854490.899478907.456322e-05 93.94113161 63.085985
54.7405875.141883.0143320.334246478.527155e-05107.89151123 58.427555
85.6352969.582924.9050560.955764714.283450e-05108.42570252 99.652516
81.0803050.381244.9665770.965711312.835659e-05102.00550497 94.427486
76.3082864.273508.8003930.290695053.300021e-05108.87203184 98.469926
87.7202555.365445.9867640.223239259.922546e-05109.31221489 94.918276
41.4066980.092681.2992380.491388654.259885e-05 97.99796547 53.004966
...........................
77.2638374.113631.4835390.931347399.585848e-05 93.74967521 80.7893814
66.8144276.910848.8250770.096571673.217130e-05 96.86412236 55.2622314
19.6406480.311135.9634040.679940629.017611e-05101.98207195 74.3126514
57.4309713.266059.6970170.727793131.456219e-05 98.80481470 55.5973214
87.9316856.619758.1193240.831644891.761321e-05 98.61217310 94.6667414
41.0467161.795519.7679580.559504921.503845e-05 93.73754476 99.1653615
68.1745351.625419.9395860.282363822.592262e-05100.91219380 95.5493315
12.3312986.386364.5039410.582801505.890082e-05100.46775419100.0000015
88.1672783.159767.1898730.985364181.119586e-05 95.50861394 51.3849015
16.2714178.630331.5870190.135303155.420034e-05 96.73401155 95.4817715
55.7059818.766954.7328310.051806809.828154e-05 97.41939464 59.3450216
83.3316082.994641.6350900.981656462.080065e-05 91.24540528 51.5111916
60.1176012.585456.1857480.993579125.083044e-05 92.50435475 54.3980816
38.3797489.545659.9645240.258210759.453912e-05106.31509469 52.8704316
37.6678281.645298.4839540.990955474.942339e-05109.39645488 55.5759516
61.4751155.191878.3246090.996053213.285353e-05 94.50943436 96.0040417
74.2946263.245214.5396190.986985173.862129e-05 93.32776348 98.6391317
45.7489768.004047.9871760.964219231.675996e-05102.26377319 99.8428117
69.3729254.126905.7833170.966624039.158785e-05100.63321485 95.2838417
64.4772455.427486.7000730.968545118.603576e-05108.93043296 95.7647017
31.0486984.713311.9983670.142777112.764362e-05 98.63503279 55.9622818
45.5675510.340292.7930650.572439841.090458e-05 96.65781430 58.8316718
85.9683548.476525.8379100.012671294.044349e-05100.55347477 87.5711118
66.9667962.695239.1788570.044299101.040186e-05108.01273493 98.7433618
58.3223814.303618.9641690.156282166.053122e-05 90.75668379 55.4140618
82.8657960.257038.3438940.200003175.378005e-05 91.34227239 98.2121219
71.1240040.521319.8897320.126746272.245635e-05 90.72959490 84.3760519
87.5467030.994449.8228830.788145502.293672e-05 91.22103484 74.2932719
83.9732212.074845.4488080.438539839.772312e-05103.49803415 51.3570519
25.6560678.272355.9095020.408160077.713984e-05 90.18903452 83.2641619
A data.table: 6 × 47
density%-similar-wantedbudget-multiplier-dummydensity-multiplier-dummynoise-dummytick-limitoutputpred_output_1RMSE_1pred_output_2...pred_output_16RMSE_16pred_output_17RMSE_17pred_output_18RMSE_18pred_output_19RMSE_19pred_output_20RMSE_20
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
89.8078163.221019.5593640.71832329.772682e-05 94.8546198.6208779.7825918.83827579.62909...88.4448610.17600989.194799.426081188.0067310.61413988.1284110.49245486.9574311.663442
65.9530427.517301.1139790.46118304.533282e-05 91.3538366.0909074.49817 8.40727175.74052...70.68430 4.59340271.076534.985631371.73028 5.63938269.37698 3.28608171.14292 5.052028
65.6374842.720067.4602210.24982484.748342e-05106.6083286.5381089.38440 2.84630389.28690...88.70574 2.16763488.280481.742383688.19619 1.65808988.75580 2.21769788.80661 2.268504
49.0847453.361871.0291110.32092412.648540e-05 95.3160297.0672092.74418 4.32302091.57756...91.93225 5.13494893.056924.010278391.99508 5.07212292.84195 4.22524992.10924 4.957961
82.3166632.863499.9374370.74247809.336225e-05 91.0006673.9935672.21350 1.78006472.05943...77.51320 3.51963373.677310.316255176.59384 2.60027775.84234 1.84877376.97699 2.983421
59.0871922.267415.2813720.73510981.573031e-05 98.7811463.6766869.65778 5.98109970.20041...68.10113 4.42445266.269632.592947268.31376 4.63708067.89492 4.21824167.23911 3.562430
A data.table: 20 × 2
iterobb_error
<dbl><dbl>
170.47199
269.23098
370.99077
464.35526
562.75405
667.19224
763.02613
861.32133
960.82498
1056.75684
1158.17025
1258.69549
1358.72205
1460.07439
1559.52274
1656.67833
1754.32064
1853.31757
1952.22112
2053.64916
In [90]:
# Reshape the adaptive-sampling (Ad) performance table to long format so each
# error metric (mae / rmse / mape) becomes its own plotted series.
performance_molten_Ad <- melt(data = performance_table_Ad,
                              id.vars = 'iter')
setnames(performance_molten_Ad, c("variable", "value"), c("errortype", "errorvalue"))

# Error trajectories over iterations; dashed horizontal lines mark the
# one-shot benchmark error per metric.
# FIX: `stat = "hline"` was removed in ggplot2 >= 2.0 and raises
# "No stat called StatHline"; geom_hline needs no stat argument.
p_Ad = ggplot(performance_molten_Ad, aes(x = iter, y = errorvalue, group = errortype, col = errortype)) +
            geom_line(lwd = 1) +
            geom_hline(data = performance_molten_oneshot,
                       aes(yintercept = errorvalue, col = errortype),
                       linetype = "dashed")
p_Ad

Final Visualization

In [91]:
# Correlation-based PCA on the final Ad training inputs ("output" excluded).
pca_final_Ad_training_set <- princomp(
  FinalTrainData_Ad[, .SD, .SDcols = !c("output")],
  cor = TRUE, scores = TRUE
)

# First two principal-component scores per instance, with the ABM output
# attached for colouring.
ad_final_coords <- get_pca_ind(pca_final_Ad_training_set)$coord[, 1:2]
pca_final_Ad_training_set_components <- cbind(
  ad_final_coords,
  FinalTrainData_Ad[, .SD, .SDcols = c("output")]
)

# Scatter of the training set in PCA space, coloured by output.
p_final_Ad_training_set <- ggplot(pca_final_Ad_training_set_components,
                                  aes(x = Dim.1, y = Dim.2)) +
  geom_point(aes(colour = output)) +
  labs(title = "", legend = "output")
p_final_Ad_training_set
In [92]:
grid.arrange(p_training_set_Ad,p_final_Ad_training_set, ncol=2)

3 Scenarios

In [93]:
# Training-set sizes of the three scenarios (adaptive, one-shot, random).
nrow(FinalTrainData_Ad)
nrow(training_set)
nrow(FinalTrainData_Rd)
295
700
295
In [94]:
# Fit one RF metamodel per sampling scenario on its final training data,
# using the shared hyper-parameters (ntree, mtry) for a fair comparison.
oneshot_model = randomForest( x = training_set[, -c("output")]
                             ,y = training_set$output
                             , importance = TRUE,ntree = ntree, mtry = mtry)
random_model = randomForest( x = FinalTrainData_Rd[,.SD, .SDcols = feature_names]
                              ,y = FinalTrainData_Rd$output
                              ,importance = TRUE,ntree = ntree, mtry = mtry)
adaptive_model = randomForest( x = FinalTrainData_Ad[,.SD, .SDcols = feature_names]
                              ,y = FinalTrainData_Ad$output
                              ,importance = TRUE,ntree = ntree, mtry = mtry)

# Print the fitted models (OOB MSE and % variance explained shown below).
oneshot_model
random_model
adaptive_model
Call:
 randomForest(x = training_set[, -c("output")], y = training_set$output,      ntree = ntree, mtry = mtry, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 400
No. of variables tried at each split: 2

          Mean of squared residuals: 28.19001
                    % Var explained: 90.72
Call:
 randomForest(x = FinalTrainData_Rd[, .SD, .SDcols = feature_names],      y = FinalTrainData_Rd$output, ntree = ntree, mtry = mtry,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 400
No. of variables tried at each split: 2

          Mean of squared residuals: 49.72727
                    % Var explained: 83.01
Call:
 randomForest(x = FinalTrainData_Ad[, .SD, .SDcols = feature_names],      y = FinalTrainData_Ad$output, ntree = ntree, mtry = mtry,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 400
No. of variables tried at each split: 2

          Mean of squared residuals: 52.81134
                    % Var explained: 84.1
In [95]:
# Score all three metamodels on the common test set; each prediction vector
# is bound to the test set under the model's own column name.
pred_oneshot <- cbind(test_set, pred_oneshot = predict(oneshot_model, test_set))
pred_rd <- cbind(test_set, pred_rd = predict(random_model, test_set))
pred_ad <- cbind(test_set, pred_ad = predict(adaptive_model, test_set))

# Per-row error via the project helper rmse_func (passed directly to mapply;
# the anonymous wrapper is unnecessary).
pred_oneshot[, RMSE := mapply(rmse_func, output, pred_oneshot)]
pred_rd[, RMSE := mapply(rmse_func, output, pred_rd)]
pred_ad[, RMSE := mapply(rmse_func, output, pred_ad)]

# Overall test-set RMSE per scenario (rmse presumably from TSrepr — verify).
rmse(pred_oneshot$output, pred_oneshot$pred_oneshot)
rmse(pred_rd$output, pred_rd$pred_rd)
rmse(pred_ad$output, pred_ad$pred_ad)
5.09705819125523
6.64499606563856
5.61263960111138
In [96]:
# OOB MSE as a function of the number of trees, for each scenario's forest.
plot(oneshot_model$mse, type="l")
plot(random_model$mse, type="l")
plot(adaptive_model$mse, type="l")

Random Sampling vs Uncertainty Sampling

Final Data Comparison

In [ ]:
grid.arrange(p_training_set, p_training_set_Ad,p_final_Rd_training_set,p_final_Ad_training_set,nrow=2, ncol=2)

Performance Comparison

In [46]:
grid.arrange(p_Rd, p_Ad, ncol=2)
In [71]:
# One-shot benchmark errors in long format (one value per metric).
performance_molten_oneshot <- melt(data = performance_table_oneshot,
                                   id.vars = 'iter')
setnames(performance_molten_oneshot, c("variable", "value"), c("errortype", "errorvalue"))

# Stack the Rd and Ad trajectories and facet by sampling type; dashed
# horizontal lines mark the one-shot benchmark per metric.
# FIX: `stat = "hline"` was removed in ggplot2 >= 2.0 and raises
# "No stat called StatHline"; geom_hline needs no stat argument.
performance_Rd_vs_Ad = rbind(performance_molten_Rd[,.(iter,errortype,errorvalue, type = "Rd")],performance_molten_Ad[,.(iter,errortype,errorvalue, type = "Ad")])
p_Rd_vs_Ad = ggplot(performance_Rd_vs_Ad, aes(x = iter, y = errorvalue, group = errortype, col = errortype)) +
            geom_line(lwd = 1) +
            geom_hline(data = performance_molten_oneshot,
                       aes(yintercept = errorvalue, col = errortype),
                       linetype = "dashed") +
            facet_wrap(~type)
p_Rd_vs_Ad
In [72]:
ggplotly(p_Rd_vs_Ad)
In [82]:
# Final-iteration RMSE of the Rd and Ad runs vs. the one-shot benchmark,
# as absolute (diff) and relative (diff_perc) gaps.
# Generalized: select the last recorded iteration instead of hard-coding 20,
# so the cell stays correct if iteration_budget changes.
comp = performance_Rd_vs_Ad[iter == max(iter) & errortype == "rmse"]
comp[, oneshot_error := performance_molten_oneshot[errortype == "rmse"]$errorvalue]
comp[, diff := errorvalue - oneshot_error]
comp[, diff_perc := diff / oneshot_error]
comp
A data.table: 2 × 7
itererrortypeerrorvaluetypeoneshot_errordiffdiff_perc
<dbl><fct><dbl><chr><dbl><dbl><dbl>
20rmse6.721750Rd5.0487571.67299370.3313675
20rmse5.605811Ad5.0487570.55705430.1103349

Adaptive Sampling & Feature Elimination

Generate Training Set

Select a relatively large data pool (nofinstances should be medium-sized), e.g. 400

In [47]:
# Restart from fresh copies for the feature-elimination scenario, so both
# adaptive scenarios begin from the identical initial data and candidate pool.
training_set_Ad = copy(adaptive_initial_data)
unlabeled_pool =copy(data_candidates)
In [ ]:
#if(GenerateTTData == 1){
#   
#    LHSample_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
#    
#    LHSample_Ad$V1 = qunif(LHSample_Ad$V1, 10, 90) 
#    LHSample_Ad$V2 = qunif(LHSample_Ad$V2, 10, 90) 
#    setnames(LHSample_Ad, c("V1","V2"), feature_names)
#    LHSample_Ad$output <- 0.00
#    
#    paste0("ABM run start time : ",Sys.time())
#    LHSample_Ad = run_ABM(nofrep,train_ins_Ad,LHSample_Ad) %>% as.data.table()
#    paste0("ABM run end time : ",Sys.time())
#    
#    fwrite(LHSample_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))
#
#}else{
#    LHSample_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
#    LHSample_Ad <- head(LHSample_Ad[`%-similar-wanted` < 90],300)
#
#}

Visualization

In [48]:
pca_training_set_Ad <- princomp(training_set_Ad[,-c("output")], cor = TRUE, scores = TRUE)
In [49]:
#fviz_pca_ind(pca_LHSample,
#             col.ind = "cos2", # Color by the quality of representation
#             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
#              geom="point"
#             )

# First two principal-component scores of the training set, with the ABM
# output attached, then plotted in PCA space coloured by output.
pca_training_set_Ad_components <- get_pca_ind(pca_training_set_Ad)
pca_training_set_Ad_components <-cbind(pca_training_set_Ad_components$coord[,1:2],training_set_Ad[,c("output")])
p_training_set_Ad <- ggplot(data = pca_training_set_Ad_components, aes(x = Dim.1, y = Dim.2)) +
                     geom_point(aes(colour = output)) +
                     labs( title = "", legend = "output") 
p_training_set_Ad

Train & Test Metamodel

In [52]:
# Decide on strategy:
# Iterations at which new samples are selected (every iteration but the last,
# given iteration_budget = 20).
sample_selection_iteration_order = c(1:19)
# Iterations at which h features are eliminated (the last four selection steps).
feature_elimination_iteration_order = c(16:19)
#iteration_budget = 3 # should be > max(max(sample_selection_iteration_order),max(feature_elimination_iteration_order))
#
#h = 1 # specify how many variable will be eliminated in each elimination iteration
In [54]:
feature_names
feature_elimination_iteration_order
  1. 'density'
  2. '%-similar-wanted'
  3. 'budget-multiplier-dummy'
  4. 'density-multiplier-dummy'
  5. 'noise-dummy'
  6. 'tick-limit'
  1. 16
  2. 17
  3. 18
  4. 19
In [55]:
## initialize record tables Record train candidates
# Labeled candidates selected at each iteration.
train_candidates_table = data.table()

# Record model performances
performance_table = data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())

# Record obb_error table
obb_error = data.table(iter = numeric(), obb_error = numeric())

# Record iteration history
# Flags per iteration: did a feature elimination and/or a data-selection
# step happen? Filled in by the loop below.
iteration_history = data.table(iter_no = numeric(), IsFeatureEliminated = logical(), IsDataSelected = logical())

## initialize variables
# keep test set undistorted
predictedLabels_table = copy(test_set)

# specify variables(columns) to be used initialize
# Start with the full feature set; elimination shrinks columns_left over time.
columns_left = feature_names
total_numof_eliminated_vars <- 0
In [56]:
# Adaptive sampling loop WITH feature elimination: like the earlier loop, but
# trains only on the surviving columns (columns_left), and on iterations in
# feature_elimination_iteration_order drops h features using the importance
# ranking computed while all features were still present.
set.seed(10)
print(paste0("section start time : ",Sys.time()))
iter = 1
while (iter <= iteration_budget) {
    
    trainx = training_set_Ad[, .SD, .SDcols = columns_left]
    trainy = training_set_Ad$output
    
    # Train the model
    model_Sub <- randomForest(x = trainx, y = trainy, importance = TRUE, ntree = ntree, mtry = mtry)
    # Keep each iteration's model as model_Sub_<iter> for later inspection.
    assign(paste0("model_Sub_", iter), model_Sub)
    
    # Rank features only while the full feature set is in play; this ranking
    # is reused by every subsequent elimination step.
    if (length(columns_left) == length(feature_names)) {
        ranked_features = get_variable_importance(model_Sub)
    }
    # Keep training set error records
    obb_error = rbind(obb_error, data.table(iter, obb_error_func(model_Sub)), use.names = FALSE)
    
    # Test the model on test set
    test_predictions_Sub = get_test_predictions(model_Sub, test_set, error_type)
    predictedLabels_Sub = test_predictions_Sub[[1]]
    # Suffix prediction/error columns with the iteration number.
    setnames(predictedLabels_Sub, c("pred_output", error_type), c(paste0("pred_output_", iter), paste0(error_type, "_", iter)))
    predictedLabels_table = cbind(predictedLabels_table, predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_", iter), paste0(error_type, "_", iter))])
    
    # Keep test set error records
    performance_table = rbind(performance_table, data.table(iter, test_predictions_Sub[[2]]), use.names = FALSE)
    
    # update iteration_history
    iteration_history = rbind(iteration_history, data.table(iter, 0, 0), use.names = FALSE)
    
    if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.
          if (iter %in% sample_selection_iteration_order) {
              ## sample selection from unlabeled data select candidates
              unlabeled_set <- copy(unlabeled_pool)
              train_candidates = sample_selection(selected_ins, unlabeled_set, model_Sub)
              
              # eliminate candidates from the unlabeled pool
              unlabeled_pool = unlabeled_pool[-train_candidates$idx]
              rm(unlabeled_set)
              
              # run ABM to find outputs of train candidates
              print(paste0("ABM train_candidate run start time : ",Sys.time()))
              train_candidates = run_ABM(nofrep, selected_ins, train_candidates)
              print(paste0("ABM train_candidate run end time : ",Sys.time()))
              
              train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))
              
              # add labeled candidates to the train data
              training_set_Ad = rbind(training_set_Ad, train_candidates[, -c("idx")])
              
              # update iteration_history
              # NOTE(review): `DT[i]$col = v` is non-idiomatic data.table; the
              # usual form is `iteration_history[iter, IsDataSelected := 1]`.
              # The printed table below shows this variant did update correctly.
               iteration_history[iter]$IsDataSelected= 1
          }
          if (iter %in% feature_elimination_iteration_order) {
              ## feature elimination apply feature elimination
              feature_elimination_result = feature_elimination(h, total_numof_eliminated_vars, ranked_features)
              
              columns_left = feature_elimination_result[[1]]  # 
              eliminated_columns = feature_elimination_result[[4]]  #   not necessary
              total_numof_eliminated_vars = as.numeric(feature_elimination_result[2])
              numof_eliminated_vars = as.numeric(feature_elimination_result[3])  #   not necessary 
              
              # update iteration_history
              iteration_history[iter]$IsFeatureEliminated= 1
          }
    }
iter = iter + 1  
}
print(paste0("section end time : ",Sys.time()))
In [ ]:
#if(iter %in% sample_selection_iteration_order){
#   ## sample selection from unlabeled data
#   # select candidates
#   unlabeled_set <- copy(unlabeled_pool)
#   train_candidates = sample_selection(selected_ins,unlabeled_set,model_Sub)
#   
#   # eliminate candidates from the unlabeled pool
#   unlabeled_pool = unlabeled_pool[- train_candidates$idx]
#   rm(unlabeled_set)
#   
#   # run ABM to find outputs of train candidates
#   paste0("ABM train_candidate run start time : ",Sys.time())
#   train_candidates = run_ABM(nofrep,selected_ins,train_candidates)
#   paste0("ABM train_candidate run end time : ",Sys.time())
#   
#   train_candidates_table = rbind(train_candidates_table, data.table(train_candidates, "iter" = iter))
#   
#   # add labeled candidates to the train data
#   LHSample_Ad = rbind(LHSample_Ad,train_candidates[,-c("idx")])
#   
#   # update iteration_history
#   iteration_history[iter == iter,IsDataSelected := 1 ]
#}
In [ ]:
#if(iter %in% feature_elimnation_iteration_order){
#   ## feature elimination
#   # apply feature elimination
#       feature_elimination_result = feature_elimination(h,total_numof_eliminated_vars,ranked_features)
#   
#       columns_left = feature_elimination_result[[1]]# 
#       eliminated_columns = feature_elimination_result[[4]]#   not necessary
#       total_numof_eliminated_vars = as.numeric(feature_elimination_result[2])  
#       numof_eliminated_vars = as.numeric(feature_elimination_result[3])#   not necessary 
#   
#   # update iteration_history
#   iteration_history["iter" == iter,IsFeatureEliminated := 1 ]
#}
In [ ]:
#performance_error_table = performance_table[,.SD,.SDcols = c("iter",tolower(error_type))]
#setnames(performance_error_table,c("iter","error"))
#performance_error_table[, lag_error := shift(error,1,type = "lag")]
#performance_error_table
In [62]:
columns_left
  1. '%-similar-wanted'
  2. 'density'
In [57]:
# Final records
# Snapshot the elimination-run results under "_AdFe" names, since the generic
# tables are shared across scenarios.
FinalTrainData_AdFe = copy(training_set_Ad)
performance_table_AdFe = copy(performance_table)
train_candidates_table_AdFe  = copy(train_candidates_table)
predictedLabels_table_AdFe = copy(predictedLabels_table)
obb_error_AdFe = copy(obb_error)
In [58]:
#fwrite(FinalTrainData_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_AdFe_DummyCode_",Sys.Date(),".csv") )
#fwrite(performance_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_AdFe_DummyCode_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_AdFe_DummyCode_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_AdFe_DummyCode_",Sys.Date(),".csv") )
#fwrite(obb_error_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_AdFe_DummyCode_",Sys.Date(),".csv") )
In [59]:
# Inspect the feature-elimination run results (printed below the cell).
nrow(FinalTrainData_AdFe)
performance_table_AdFe
train_candidates_table_AdFe
head(predictedLabels_table_AdFe)
obb_error_AdFe
295
A data.table: 20 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
16.1099347.6924288.606045
26.0463587.5271568.464582
35.8576577.2543088.228036
45.7262807.0377808.015131
55.7041057.0719507.962050
65.6516086.9231637.950652
75.4482636.6650117.678513
85.4013876.6293177.624565
95.2656356.4870697.351330
105.4525986.6531717.624294
115.2148316.4386457.299752
125.2325076.4533627.318562
135.2718086.4600807.329394
145.0828136.2526467.043563
155.0381326.2048036.944509
164.9650796.1996486.907999
174.0264835.2238315.565957
182.8685653.9784483.999287
191.9599192.9763292.653645
201.4996782.6239562.040339
A data.table: 95 × 9
density%-similar-wantedbudget-multiplier-dummydensity-multiplier-dummynoise-dummytick-limitidxoutputiter
<dbl><dbl><dbl><dbl><dbl><dbl><int><dbl><dbl>
65.2298677.932161.5948930.586424547.646208e-05103.04365414 55.138951
52.7726174.381229.3104900.437773605.293214e-05108.28870401 99.859231
86.8924761.693216.1287570.435170824.593647e-05108.78252477 98.347641
46.8244875.419333.6752210.362212387.882518e-05100.95472 17 62.768761
74.4286175.697073.2528360.432996618.867011e-05 97.83753179 52.562201
83.7991659.364554.2926760.178183509.957560e-05 95.69266414 97.362602
11.7437980.453368.9753410.589728888.137752e-05 98.39805467100.000002
79.5035287.646971.5157020.923534044.807250e-05103.42913281 50.126182
77.9911072.725104.9945700.820376718.099046e-05 92.24192119 82.286402
26.3372787.510246.5362200.978315021.702649e-05 92.92470424 60.253442
21.8408681.025148.7856280.669511306.048680e-05 94.57482 90 67.938663
35.8148882.163214.1912610.619513413.935635e-05106.83474164 54.935293
41.5530279.902648.7534590.685002384.525147e-05100.36608146 70.069393
59.0322476.104125.9198260.175946923.370207e-05107.85430389 57.188583
20.7677580.641807.8528500.330095624.629115e-05 93.27826235 72.412223
84.3460863.585793.5152460.245944962.370941e-05101.28581273 98.507904
54.1006679.742645.5245600.220484718.476829e-05 91.26807265 58.652554
86.1401052.779771.6970820.023884234.728572e-05108.34975449 94.711144
50.4011966.549581.7107720.346787745.129994e-05109.54256452 99.002524
83.4399260.998431.2191590.948148605.384972e-05 98.55848421 98.263454
78.8202567.505972.8220170.599542273.189739e-05102.29049 49 99.693025
15.5503456.347201.8525040.984213265.975317e-05100.86490455 99.810055
15.9515048.234619.0368810.995041119.679117e-05103.00932392 95.775755
20.9037937.955265.5409310.980096593.181878e-05 93.06444391 93.032085
78.9413012.408311.3642670.292185816.682903e-05109.33877479 51.604375
63.1147945.596663.5744530.182498716.585885e-05109.61623308 89.522736
16.4346750.583192.6428430.526737783.244350e-05109.83209403 99.442346
87.1615284.628496.4981120.086453001.809380e-05 97.18913382 50.551596
18.5608678.101912.1635740.361785266.327820e-05105.09946250 91.195756
18.8167710.801511.1043500.424377698.685976e-05109.78843391 78.391036
...........................
32.9573664.552406.9554060.612670809.692770e-05107.6422822099.1544814
37.8739487.328569.5474980.962429325.016641e-05 95.0656143553.0660914
83.1140869.727144.4384710.100265496.212719e-05 93.1560433299.6550514
56.1421967.746349.0555920.674772279.194763e-05108.6777236799.7417814
51.1012441.581139.7391190.138363568.075575e-05 95.4644532288.8528814
39.8614275.495297.5384980.183283796.666039e-05 93.4649114367.9564215
48.8399555.082581.6742430.923684825.647778e-05 91.8437238997.0360815
51.1980766.978988.7081580.212900242.709581e-05 90.8867929599.8010115
35.2757277.138066.4786450.281451458.277539e-05 99.7406933471.6671015
23.2594676.517393.1597070.775642816.807990e-05103.98821 4682.9297315
47.1184317.042983.4073760.272201502.059711e-05 90.6196427161.0394316
68.9996373.339003.4642600.754938038.389796e-05 98.76016 7399.8745716
88.7632115.082476.8900200.721333099.932740e-05 95.4384441655.2508816
61.7898244.286541.2532610.033374088.585987e-05102.5038140889.5303016
54.5722415.583779.4801850.374122168.736561e-05101.1952838756.4979616
44.9139673.472262.3428930.274203591.720096e-05103.7313311699.8540317
52.0189572.570639.4975060.393836725.631709e-05 95.7341537399.8368417
21.4991039.538087.6747220.075054519.891580e-05100.7776437193.3297817
84.5173472.111806.6337560.401761246.462193e-05 98.2276236375.5154317
23.5811578.990896.8415220.243797601.495098e-05 98.5161522282.8858717
72.4417272.988833.6166210.275656315.339539e-05107.9310113797.7660218
76.0708471.696915.1953810.371281057.906183e-05 93.13224 6883.5694118
47.3001112.012326.5251500.405395521.753875e-05103.8754540358.0292118
73.8508846.015944.8663070.808496869.978542e-05 90.5603736488.7615718
31.5446284.407786.0574260.780238155.844023e-05103.6149914456.3468318
24.5648880.009025.5543000.557734654.148475e-05 92.8930410463.7391019
79.0819658.674728.1039760.805558569.261097e-05104.2819018197.0454119
80.8966970.862222.0543270.517708065.525847e-05 96.7048516899.7119619
48.4564933.399295.5383850.404456827.032383e-05106.6321610185.5533719
86.4187757.127638.0439020.214516659.323104e-05 94.7851621594.6494219
A data.table: 6 × 47
density%-similar-wantedbudget-multiplier-dummydensity-multiplier-dummynoise-dummytick-limitoutputpred_output_1RMSE_1pred_output_2...pred_output_16RMSE_16pred_output_17RMSE_17pred_output_18RMSE_18pred_output_19RMSE_19pred_output_20RMSE_20
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
89.8078163.221019.5593640.71832329.772682e-05 94.8546198.6208778.7423819.87848479.30676...82.9789815.641884782.0894716.531399989.196029.424845391.051257.569613095.449383.1714921
65.9530427.517301.1139790.46118304.533282e-05 91.3538366.0909075.47925 9.38835475.01140...74.62278 8.531888273.55249 7.461589171.752465.661561371.103165.012260270.033383.9424801
65.6374842.720067.4602210.24982484.748342e-05106.6083286.5381090.07828 3.54018190.10383...88.83122 2.293121088.18868 1.650579187.923521.385418387.754561.216460488.025851.4877443
49.0847453.361871.0291110.32092412.648540e-05 95.3160297.0672092.66290 4.40430291.79646...92.08382 4.983378693.00067 4.066523495.848041.219161096.157270.909923396.907100.1601008
82.3166632.863499.9374370.74247809.336225e-05 91.0006673.9935672.81416 1.17940373.72003...74.39399 0.400428673.86088 0.132688774.447590.454025175.671941.678377174.779500.7859376
59.0871922.267415.2813720.73510981.573031e-05 98.7811463.6766869.83373 6.15705570.05308...69.61065 5.933968166.61419 2.937514165.979602.302921164.302480.625800763.175320.5013579
A data.table: 20 × 2
iterobb_error
<dbl><dbl>
168.08468
267.27775
367.02113
463.43798
564.80633
664.43386
759.22362
860.19234
956.61025
1060.14894
1156.42037
1258.50032
1357.89785
1453.33422
1555.25109
1653.88430
1737.98917
1825.80777
1915.26414
2012.41144
In [60]:
iteration_history
A data.table: 20 × 3
iter_noIsFeatureEliminatedIsDataSelected
<dbl><dbl><dbl>
101
201
301
401
501
601
701
801
901
1001
1101
1201
1301
1401
1501
1611
1711
1811
1911
2000
In [61]:
# Long-format AdFe errors: one row per (iteration, metric).
performance_molten_AdFe <- melt(data = performance_table_AdFe, id.vars = 'iter')
setnames(performance_molten_AdFe,
         c("variable", "value"),
         c("errortype", "errorvalue"))

# Iterations immediately after a feature-elimination / data-selection step
# (shifted by +1, matching the original cell).
elim_marks <- iteration_history[IsFeatureEliminated == 1]$iter_no + 1
select_marks <- iteration_history[IsDataSelected == 1]$iter_no + 1

# Error trajectories; dashed verticals mark eliminations, yellow dot-dash
# verticals mark sample-selection steps.
p_AdFe = ggplot(performance_molten_AdFe,
                aes(x = iter, y = errorvalue, group = errortype, col = errortype)) +
  geom_line(lwd = 1) +
  geom_vline(xintercept = elim_marks, linetype = "dashed") +
  geom_vline(xintercept = select_marks, linetype = "dotdash", color = "yellow")
p_AdFe

Adaptive Sampling with / without Feature Elimination

In [63]:
grid.arrange(p_Ad, p_AdFe, ncol=2)
In [73]:
# Stack the Ad and AdFe trajectories and facet by scenario; vertical dashed
# lines mark elimination steps, horizontal dashed lines the one-shot benchmark.
# FIX: `stat = "hline"` was removed in ggplot2 >= 2.0 and raises
# "No stat called StatHline"; geom_hline needs no stat argument.
performance_Ad_vs_AdFe = rbind(performance_molten_Ad[,.(iter,errortype,errorvalue, type = "Ad")], performance_molten_AdFe[,.(iter,errortype,errorvalue, type = "AdFe")])
p_Ad_vs_AdFe = ggplot(performance_Ad_vs_AdFe, aes(x = iter, y = errorvalue, group = errortype, col = errortype)) +
            geom_line(lwd = 1) +
            geom_vline(xintercept = iteration_history[IsFeatureEliminated==1]$iter_no + 1, linetype = "dashed") +
            geom_hline(data = performance_molten_oneshot,
                       aes(yintercept = errorvalue, col = errortype),
                       linetype = "dashed") +
            facet_wrap(~type)
p_Ad_vs_AdFe
In [74]:
ggplotly(p_Ad_vs_AdFe)
In [ ]:
#ggplotGrob(p_AdFe)
In [85]:
varImpPlot(model_Sub_1)

Quit NL

In [ ]:
# Shut down the headless NetLogo instance started at the top of the notebook.
NLQuit(nl.obj = nl.model)
#NLQuit(all=FALSE)